# ==============================================================================
# Import
# ==============================================================================

library(tidyverse) 
library(purrr) 

source('code/utils.R')

# Raw survey tables, read as-is from data/sero/. Each is reshaped further down
# into the columns: state, date1, date2, n, rate, lwr, upr, metric, survey.

# 2020-2021 blood donor survey: combined infection- and vaccination-induced
serodat_blood_2021_combined <- read_csv(
	file = "data/sero/2020-2021_Nationwide_Blood_Donor_Seroprevalence_Survey_Combined_Infection-_and_Vaccination-Induced_Seroprevalence_Estimates.csv"
)

# 2020-2021 blood donor survey: infection-induced only
serodat_blood_2021_infection <- read_csv(
	file = "data/sero/2020-2021_Nationwide_Blood_Donor_Seroprevalence_Survey_Infection-Induced_Seroprevalence_Estimates.csv"
)

# 2022 blood donor survey (quarterly; one row per indicator and stratum)
serodat_blood_2022 <- read_csv(
	file = "data/sero/2022_Nationwide_Blood_Donor_Seroprevalence_Survey_Combined_Infection-_and_Vaccination-Induced_Seroprevalence_Estimates.csv"
)

# Commercial laboratory survey
serodat_commercial <- read_csv(
	file = "data/sero/Nationwide_Commercial_Laboratory_Seroprevalence_Survey.csv"
)

# Ideal framework: 
# State - StartDate - EndDate - Survey - SeroInf - SeroComb 

# ==============================================================================
# Commercial
# ==============================================================================

# Statewide Anti-N estimates from the commercial laboratory survey, reduced to
# the shared layout: state, date1, date2, n, rate, lwr, upr, metric, survey.
#
# The collection window arrives as a single string. It is split on "-" into a
# start half and an end half, and each half is split on "," into a day part and
# an optional year. When the start half carries no year of its own, the year of
# the end half is used for it.
# (presumably strings look like "Jul 27 - Aug 13, 2020" - confirm against data)
#
# Other columns of the raw table (e.g. `Catchment population` and the estimated
# cumulative infection counts) are intentionally not carried through.
#
# NOTE(review): the survey label below is spelled "Commerical". It is kept
# unchanged here because it is written to the output CSV and downstream code
# may match on it; confirm before correcting the spelling.
serodat_commercial_ft <- serodat_commercial %>%
	# Keep statewide rows that have a collection window
	filter(
		`Catchment Area Description` == "Statewide",
		!is.na(`Date Range of Specimen Collection`)
	) %>%
	# Break the window into start / end, then each into day part and year
	separate_wider_delim(
		`Date Range of Specimen Collection`,
		delim = "-",
		names = c("Date1", "Date2")
	) %>%
	separate_wider_delim(
		Date1,
		delim = ",",
		names = c("Date1Date", "Date1Year"),
		too_few = "align_start"
	) %>%
	separate_wider_delim(
		Date2,
		delim = ",",
		names = c("Date2Date", "Date2Year"),
		too_few = "align_start"
	) %>%
	# Borrow the end year when the start has none, then parse both dates
	mutate(
		Date1Year = coalesce(Date1Year, Date2Year),
		Date1 = mdy(paste(Date1Date, Date1Year)),
		Date2 = mdy(paste(Date2Date, Date2Year))
	) %>%
	# Keep only the harmonised columns
	select(
		state = Site,
		date1 = Date1,
		date2 = Date2,
		n = `n [Anti-N, All Ages Cumulative Prevalence, Rounds 1-30 only]`,
		rate = `Rate (%) [Anti-N, All Ages Cumulative Prevalence, Rounds 1-30 only]`,
		lwr = `Lower CI [Anti-N, All Ages Cumulative Prevalence, Rounds 1-30 only]`,
		upr = `Upper CI [Anti-N, All Ages Cumulative Prevalence, Rounds 1-30 only]`
	) %>%
	mutate(
		metric = "N",
		survey = "Commerical"
	)

# ==============================================================================
# 2021 combined 
# ==============================================================================

# Shared reshaping for the two 2020-2021 blood donor tables, which have the
# same column layout and differ only in the metric they report.
#
# Arguments:
#   survey_df    Raw donor table with columns `Year and Month`,
#                `Region Abbreviation`, `n [Total Prevalence]`,
#                `Rate %[Total Prevalence]`, `Lower CI %[Total Prevalence]`
#                and `Upper CI  %[Total Prevalence]`.
#   metric_code  Value stored in the `metric` column of the result.
#
# Returns an ungrouped tibble with one row per state and month and the columns
# state, date1, date2, n, rate, lwr, upr, metric, survey.
#
# Notes:
#   * date2 is date1 plus one month, i.e. the first day of the following month,
#     whereas the 2022 donor table in this file uses the last day of the period.
#   * Rows for region "All" and regions starting with "CR" are dropped; the
#     state is the first two characters of the remaining region abbreviations.
#   * Regional rates are converted to positive counts (rate/100 * n), summed
#     within state, and converted back to a percentage.
#   * NOTE(review): lwr/upr are built the same way, so they are n-weighted
#     averages of the regional CI bounds rather than a formally pooled
#     interval - confirm this approximation is intended.
#   * `.groups = "drop"` returns an ungrouped result, so no grouping leaks into
#     later steps and summarise() does not emit its regrouping message.
aggregate_donor_2021_by_state <- function(survey_df, metric_code) {
	survey_df %>%
		# Format dates ---------------------------------------------------------
		mutate(
			date1 = ymd(paste0(`Year and Month`, "-01")),
			date2 = date1 + months(1)
		) %>%
		# Extract useful columns -----------------------------------------------
		select(
			region = `Region Abbreviation`, date1, date2,
			n = `n [Total Prevalence]`,
			rate = `Rate %[Total Prevalence]`,
			lwr = `Lower CI %[Total Prevalence]`,
			upr = `Upper CI  %[Total Prevalence]`
		) %>%
		# Remove overall and census-regional estimates -------------------------
		filter(region != "All", !grepl("^CR", region)) %>%
		# Aggregate counts by state --------------------------------------------
		mutate(
			state = substr(region, 1, 2),
			pos = rate / 100 * n,
			pos_lwr = lwr / 100 * n,
			pos_upr = upr / 100 * n
		) %>%
		group_by(state, date1, date2) %>%
		summarise(
			n = sum(n),
			pos = sum(pos),
			pos_lwr = sum(pos_lwr),
			pos_upr = sum(pos_upr),
			.groups = "drop"
		) %>%
		mutate(
			rate = pos / n * 100,
			lwr = pos_lwr / n * 100,
			upr = pos_upr / n * 100
		) %>%
		select(state, date1, date2, n, rate, lwr, upr) %>%
		mutate(metric = metric_code, survey = "Donor")
}

serodat_blood_2021_combined_ft <- aggregate_donor_2021_by_state(
	serodat_blood_2021_combined,
	metric_code = "S"
)


# ==============================================================================
# 2021 infection 
# ==============================================================================

serodat_blood_2021_infection_ft <- aggregate_donor_2021_by_state(
	serodat_blood_2021_infection,
	metric_code = "N"
)


# ==============================================================================
# 2022 donor 
# ==============================================================================

# 2022 blood donor survey, reduced to the shared layout:
# state, date1, date2, n, rate, lwr, upr, metric, survey.
#
# Only non-aggregate geographic areas and the "Overall" race / sex / age strata
# are kept, and only the two indicators that map onto the metric codes used
# elsewhere in this file ("S" for combined seroprevalence, "N" for past
# infection). date1 / date2 are the first and last day of the quarter named in
# `Time Period`; any other `Time Period` value yields NA dates.
serodat_blood_2022_ft <- serodat_blood_2022 %>%
	filter(
		!(`Geographic Area` %in% c("Overall", "Northeast", "Midwest", "South", "West")),
		Race == "Overall",
		Sex == "Overall",
		Age == "Overall",
		Indicator %in% c("Past infection with or without vaccination", "Combined seroprevalence")
	) %>%
	# Only the two indicators above remain, so a two-way recode is sufficient
	mutate(Indicator = if_else(Indicator == "Combined seroprevalence", "S", "N")) %>%
	# convert_state() comes from code/utils.R; it is given a STATE column here
	# and an ABBREV column is read back below
	# (presumably full state name -> postal abbreviation - confirm in utils.R)
	rename(STATE = `Geographic Area`) %>%
	convert_state("abbrev") %>%
	# Position of the quarter label indexes the start / end date lookups
	mutate(
		qtr = match(
			`Time Period`,
			c("2022 Quarter 1", "2022 Quarter 2", "2022 Quarter 3", "2022 Quarter 4")
		),
		date1 = ymd(c("2022-01-01", "2022-04-01", "2022-07-01", "2022-10-01"))[qtr],
		date2 = ymd(c("2022-03-31", "2022-06-30", "2022-09-30", "2022-12-31"))[qtr]
	) %>%
	select(
		state = ABBREV,
		date1,
		date2,
		n = `n (Unweighted)`,
		rate = `Estimate % (weighted)`,
		lwr = `2.5%`,
		upr = `97.5%`,
		metric = Indicator
	) %>%
	mutate(survey = "Donor")


# ==============================================================================
# Combine the surveys
# ==============================================================================


# Stack the four harmonised tables (commercial first, then the donor surveys)
# into one long table with one row per state, window, survey and metric.
serodat_full <- bind_rows(
	list(
		serodat_commercial_ft,
		serodat_blood_2021_combined_ft,
		serodat_blood_2021_infection_ft,
		serodat_blood_2022_ft
	)
)


# Seroprevalence over time: one line per state and metric, coloured by metric
# (N = red, S = black), with one panel per survey.
#
# Puerto Rico and rows with rate above 100 (or missing) are left out of the
# figure only; serodat_full itself is not modified.
#
# The per-state lines are requested with an explicit `group` aesthetic. The
# previous version mapped `fill = state` purely to trigger ggplot2's implicit
# grouping, although neither the points (default shape) nor the lines draw a
# fill; the rendered figure is the same.
fig_serodat <- serodat_full %>%
	filter(state != "PR", rate <= 100) %>%
	ggplot(aes(x = date1, y = rate, group = interaction(state, metric), col = metric)) +
		geom_point(size = 0.2) +
		geom_line(alpha = 0.4) +
		theme_classic() +
		scale_color_manual(values = c("N" = "red", "S" = "black")) +
		scale_x_date(date_labels = "%b %Y") +
		facet_wrap(~survey) +
		theme(
			text = element_text(size = 9),
			legend.position = "none",
			axis.text.x = element_text(angle = 60, hjust = 1)
		) +
		labs(x = "Date", y = "Percent seroprevalence")

# Persist the combined table. It is written unfiltered: the Puerto Rico and
# rate <= 100 exclusions above apply to the figure only.
write_csv(
	x = serodat_full,
	file = "data/sero/serodat_full.csv"
)
